Setup R env. Load packages and set default image export formats, size and resolution.
knitr::opts_chunk$set(echo = TRUE,
fig.height = 8,
fig.width = 12,
dev = c("png", "pdf"),
dpi = 1000)
library(ggplot2)
library(ggtree)
## ggtree v3.6.2 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
##
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
library(aplot)
library(treeio)
## treeio v1.22.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
##
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(scales)
library(ggrepel)
library(stringr)
library(glue)
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:ggtree':
##
## rotate
options(scipen = 999) #Prevent scientific notation
Function to plot BUSCO categories - uses the standard BUSCO color scheme and also plots the percent for each category.
format_values_1 = function(values){
out <- c()
for(value in values) {
if(value != 0){
out <- c(out, sprintf(" %1.1f%%", value))
} else {
out <- c(out, "")
}
}
return(out)
}
plot_BUSCO_percentages = function(data, plot_title,
plot.axis.text.y=FALSE,
plot.legend=FALSE,
plot_size_ratio=1, plot_family="sans",
my_colors=c("#56B4E9", "#3492C7", "#F0E442", "#F04442") # Color pallet for BUSCO gene categories.
)
{
p <- data %>%
melt(id.var="SampleID") %>%
rename(BUSCO_categories = variable, Count = value) %>%
mutate(BUSCO_categories = factor(BUSCO_categories, c("Complete", "Single-copy", "Duplicated", "Fragmented", "Missing"))) %>%
filter(BUSCO_categories!="Complete") %>%
filter(BUSCO_categories!="Total") %>%
ggplot(aes(y = Count, x = SampleID, fill = BUSCO_categories)) +
geom_bar(position = position_stack(reverse = TRUE),
stat="identity") +
coord_flip() +
# geom_text_repel: https://ggrepel.slowkow.com/articles/examples.html
geom_text_repel(aes(label = format_values_1(Count)), # Add two spaces in from of text so it centers off the number not number%
size = rel(5)*plot_size_ratio,
fontface = "bold",
max.overlaps = Inf, #Set max.overlaps = Inf to override this behavior and always show all labels, regardless of too many overlaps.
position = position_stack(vjust=0.5, reverse = TRUE),
xlim = c(-Inf, Inf), # Set xlim or ylim to Inf or -Inf to disable repulsion away from the edges of the panel.
ylim = c(-Inf, Inf),
point.size = NA, #size of each point for each text label; Set point.size = NA to prevent label repulsion away from data points.
force=0.5, # force of repulsion between overlapping text labels
box.padding=0, # padding around the text label
direction="y") + # move text labels “both” (default), “x”, or “y” directions
theme_gray(base_size = 8) +
scale_y_continuous(labels = c("0","20","40","60","80","100"),
breaks = c(0,20,40,60,80,100)) +
scale_fill_manual(values = my_colors,
labels = c("Complete (C) and single-copy (S)",
"Complete (C) and duplicated (D)",
"Fragmented (F)",
"Missing (M)")) +
ggtitle(plot_title) +
xlab("") +
ylab("%BUSCOs") +
theme(plot.title = element_text(family = plot_family,
hjust=0.5,
colour = "black",
size = rel(2.2)*plot_size_ratio,
face = "bold")) +
theme(plot.margin=unit(c(0, 0, 0, 0), "mm")) +
theme(panel.background = element_rect(color = "#FFFFFF", fill = "white")) +
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major = element_blank()) +
theme(axis.text.y = element_text(family = plot_family, face = "bold", size = rel(1.66)*plot_size_ratio, colour = "black")) +
theme(axis.text.x = element_text(family = plot_family, face = "bold", size = rel(1.66)*plot_size_ratio, colour = "black")) +
theme(axis.line = element_line(linewidth = 1*plot_size_ratio, colour = "black")) +
theme(axis.ticks.length = unit(0.85*plot_size_ratio, "cm")) +
theme(axis.ticks.y = element_line(colour = "white", linewidth = 0)) +
theme(axis.ticks.x = element_line(colour = "#222222")) +
theme(axis.ticks.length = unit(0.4*plot_size_ratio, "cm")) +
theme(axis.title.x = element_text(family = plot_family, size = rel(2)*plot_size_ratio)) +
theme(legend.position = "bottom", legend.title = element_blank()) +
theme(legend.text = element_text(family = plot_family, size = rel(1.2)*plot_size_ratio)) +
theme(legend.key.size = unit(1.5*plot_size_ratio,"line")) +
guides(fill = guide_legend(override.aes = list(colour = NULL))) +
guides(fill = guide_legend(nrow = 1, byrow = TRUE))
if (!plot.axis.text.y) {
p <- p + theme(axis.text.y=element_blank())
}
if (!plot.legend) {
p <- p + theme(legend.position = "none")
}
return(p)
}
Function to plot basic single-category bar chart (not stacked). User provided color for bars.
format_values_2 = function(values){
out <- c()
for(value in values) {
if(value != 0){
out <- c(out, scales::comma(value))
} else {
out <- c(out, "")
}
}
return(out)
}
plot_barchart = function(data, column.id,
plot_title, xlab, bar.fill,
divide.by=1, y.log2=FALSE, y.log10=FALSE,
add.geom_text=TRUE, plot.axis.text.y=FALSE,
plot_size_ratio=1, plot_family="sans")
{
p <- data %>%
melt(id.var="SampleID") %>%
filter(variable==column.id) %>%
mutate(value = as.numeric(value)) %>%
mutate(value = value/divide.by) %>%
ggplot(aes(y = value, x = SampleID)) +
geom_bar(position = position_stack(),
stat="identity",
fill=bar.fill) +
scale_y_continuous(labels = comma) +
coord_flip() +
theme_gray(base_size = 8) +
ggtitle(plot_title) +
xlab("") +
ylab(xlab) +
theme(plot.title = element_text(family = plot_family,
hjust=0.5,
colour = "black",
size = rel(2.2)*plot_size_ratio,
face = "bold")) +
theme(plot.margin=unit(c(0, 0, 0, 0), "mm")) +
theme(legend.position="top",legend.title = element_blank()) +
theme(legend.text = element_text(family = plot_family, size = rel(1.2)*plot_size_ratio)) +
theme(legend.key.size = unit(1.5*plot_size_ratio,"line")) +
theme(panel.background = element_rect(color = "#FFFFFF", fill = "white")) +
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major = element_blank()) +
theme(axis.text.y = element_text(family = plot_family, face = "bold", size = rel(1.66)*plot_size_ratio, colour = "black")) +
theme(axis.text.x = element_text(family = plot_family, face = "bold", size = rel(1.66)*plot_size_ratio, colour = "black")) +
theme(axis.line = element_line(colour = "black", linewidth = 1*plot_size_ratio)) +
theme(axis.ticks.length = unit(0.85*plot_size_ratio, "cm")) +
theme(axis.ticks.y = element_line(colour = "white", linewidth = 0)) +
theme(axis.ticks.x = element_line(colour = "#222222")) +
theme(axis.ticks.length = unit(0.4*plot_size_ratio, "cm")) +
theme(axis.title.x = element_text(family = plot_family, size = rel(2)*plot_size_ratio)) +
guides(fill = guide_legend(override.aes = list(colour = NULL))) +
guides(fill = guide_legend(nrow = 1, byrow = TRUE))
if (y.log10) {
p <- p + scale_y_continuous(trans="log10")
}
if (y.log2) {
p <- p + scale_y_continuous(trans="log2")
}
if (add.geom_text) {
p <- p + geom_text(aes(label=format_values_2(value), y=0), size=0.75, hjust='left')
}
if (!plot.axis.text.y) {
p <- p + theme(axis.text.y=element_blank())
}
return(p)
}
Function to generate a formatted taxa names from samples sheet.
# Format for plotting
# See https://yulab-smu.top/treedata-book/faq.html?q=rename#faq-formatting-label
format_sp_name = function(x){
n = x[["genera"]]
# Add species if present
if(x[["species"]] != ""){
n = paste(n, x[["species"]], sep="~")
}
# Either print with extra info (not italics) or not
if(x[["extra"]] != ""){
e = str_replace_all(x[["extra"]], " ", "~")
e = str_replace_all(e, ",", "")
lab = glue(paste("italic(",n,")~",e, sep=''))
} else {
lab = glue(paste("italic(",n,")", sep=''))
}
return(lab)
}
Load the cleaned and processed datasets into R for plotting.
species.tree <- read.iqtree(
"../01_Orthofinder/Run2.SpeciesTree_rooted_node_labels.tre")
## Warning in sub("/.*", "", nlabel) %>% as.numeric: NAs introduced by coercion
## Warning in sub(".*/", "", nlabel) %>% as.numeric: NAs introduced by coercion
species.names <- as.phylo(species.tree)$tip.label
samples <- read.csv("../samples.txt", header=TRUE, sep='\t') %>%
filter(sample_id %in% species.names)
genome.stats <- read.table(
"all_genomes-01_stats-results.tsv", sep='\t',
header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) %>%
filter(SampleID %in% species.names)
transcriptome.stats <- read.table(
"all_transcriptomes-01_stats-results.tsv", sep='\t',
header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) %>%
filter(SampleID %in% species.names) %>%
rename(`Number of genes` = `Number of CDS`)
genomeData.busco.genome.eukaryota <- read.table(
"all_genomes-02_busco-genome.fa.busco_eukaryota_odb10-results.tsv", sep='\t',
header=TRUE, check.names=FALSE) %>%
filter(SampleID %in% species.names)
genomeData.busco.genome.metazoa <- read.table(
"all_genomes-02_busco-genome.fa.busco_metazoa_odb10-results.tsv", sep='\t',
header=TRUE, check.names=FALSE) %>%
filter(SampleID %in% species.names)
genomeData.busco.protein.eukaryota <- read.table(
"all_genomes-02_busco-pep.faa.busco_eukaryota_odb10-results.tsv", sep='\t',
header=TRUE, check.names=FALSE) %>%
filter(SampleID %in% species.names)
genomeData.busco.protein.metazoa <- read.table(
"all_genomes-02_busco-pep.faa.busco_metazoa_odb10-results.tsv", sep='\t',
header=TRUE, check.names=FALSE) %>%
filter(SampleID %in% species.names)
transcriptomeData.busco.protein.eukaryota <- read.table(
"all_transcriptomes-02_busco-pep.faa.busco_eukaryota_odb10-results.tsv", sep='\t',
header=TRUE, check.names=FALSE) %>%
filter(SampleID %in% species.names)
transcriptomeData.busco.protein.metazoa <- read.table(
"all_transcriptomes-02_busco-pep.faa.busco_metazoa_odb10-results.tsv", sep='\t',
header=TRUE, check.names=FALSE) %>%
filter(SampleID %in% species.names)
Merge transcriptome and genome datasets together so that all species from the tree are represented (species without this data get zero values).
# Genome/Transcriptome stats
seq.stats <- merge(genome.stats, transcriptome.stats, all=TRUE) %>% replace(is.na(.), 0)
# BUSCO results
busco.protein.eukaryota <- rbind(genomeData.busco.protein.eukaryota, transcriptomeData.busco.protein.eukaryota)
busco.protein.metazoa <- rbind(genomeData.busco.protein.metazoa, transcriptomeData.busco.protein.metazoa)
busco.genome.eukaryota <- data.frame(SampleID=species.names)
busco.genome.eukaryota <- merge(x=busco.genome.eukaryota, y=genomeData.busco.genome.eukaryota, all.x=TRUE) %>% replace(is.na(.), 0)
busco.genome.metazoa <- data.frame(SampleID=species.names)
busco.genome.metazoa <- merge(x=busco.genome.metazoa, y=genomeData.busco.genome.metazoa, all.x=TRUE) %>% replace(is.na(.), 0)
Extract the sample order from dendrogram (using “working” sample names). Also load pretty sample names and replace the labels in the tree with these new names.
Global setting for plotting
plot_size_ratio <- 0.2
plot.axis.text.y <- FALSE
plot.legend <- FALSE
Data.frame with clade positions in tree and colors.
# Dataframe of clades to highlight
clade.colors <- data.frame(
node = c(132, 138, 144, 146, 148, 164, 171, 172, 183, 179, 130, 126, 122, 124, 134, 136, 133, 56, 57),
fill = c("blue", "red", "brown", "pink", "yellow", "purple", "green", "orange", "blue", "red", "purple", "green", "yellow", "red", "blue", "blue", "yellow", "green", "blue"),
alpha = c(0.2, 0.4, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.5, 0.5, 0.5, 0.5, 0.5, 0.1, 0.2, 0.4, 0.2),
n = c("Cnidaria", "Staurozoa", "Scyphozoa", "Cubozoa", "Hydrozoa", "Octocorallia", "Hexacorallia", "Actiniaria", "Scleractinia", "Corallimorpharia", "Placozoa", "Porifera", "Ctenophora", "Choanoflagellata", "Myxozoa", "Medusozoa", "Endocnidozoa", "Zoantharia", "Scleractinia")
)
Format species tree. Color clades and rename taxa into a pretty format (i.e., genera+species italics + extra info not italics). - Show tree with full taxonomy to verify that our clades are in the correct position.
samples$name <- apply(samples, 1, format_sp_name )
p <- ggtree(species.tree) %<+% samples +
geom_highlight(
data = clade.colors,
mapping = aes(
node=node,
fill=I(fill),
alpha=I(alpha),
),
extendto = 1.65,
to.bottom=TRUE,
) +
geom_cladelab(
data = clade.colors,
mapping = aes(
node=node,
label=n,
),
align = TRUE,
offset.text = 0.0,
hjust = "center",
fontsize = 2,
offset = 0.5,
barsize = 0,
fontface="bold",
) +
geom_tiplab(aes(label=name), size=6*plot_size_ratio, linesize=0.2, parse=TRUE, align=TRUE, offset=0.01) +
geom_tiplab(aes(label=order), size=6*plot_size_ratio, linesize=0.0, parse=TRUE, align=TRUE, offset=0.18) +
geom_tiplab(aes(label=subclass), size=6*plot_size_ratio, linesize=0.0, parse=TRUE, align=TRUE, offset=0.25) +
geom_tiplab(aes(label=class), size=6*plot_size_ratio, linesize=0.0, parse=TRUE, align=TRUE, offset=0.33) +
geom_tiplab(aes(label=phylum), size=6*plot_size_ratio, linesize=0.0, parse=TRUE, align=TRUE, offset=0.40) +
geom_tippoint(aes(color=data_type), size=6*plot_size_ratio, show.legend=FALSE) +
hexpand(0.1)
p + geom_text(aes(label=node))
Generate data plots that we will combine together in the final plot.
p.genome.stats.scafs <- plot_barchart(data=seq.stats,
column.id="Number of scaffolds",
plot_title="No. scaffolds",
xlab="Number of scaffolds",
bar.fill="#fdb462",
plot.axis.text.y=plot.axis.text.y,
plot_size_ratio=plot_size_ratio)
p.genome.stats.bases <- plot_barchart(data=seq.stats,
column.id="Total scaffold length (bp)",
plot_title="Total bases",
xlab="Bases (Mbp)",
bar.fill="#fb8072",
divide.by=1000000,
plot.axis.text.y=plot.axis.text.y,
plot_size_ratio=plot_size_ratio)
p.genome.stats.N50 <- plot_barchart(data=seq.stats,
column.id="N50 of scaffolds (bp)",
plot_title="Assembly scaffold N50",
xlab="Scaffold N50 (Kbp)",
bar.fill="#bebada",
divide.by=1000,
plot.axis.text.y=plot.axis.text.y,
plot_size_ratio=plot_size_ratio)
p.gene.stats.count <- plot_barchart(data=seq.stats,
column.id="Number of genes",
plot_title="Protein-coding genes",
xlab="Number of genes",
bar.fill="#d9d9d9",
plot.axis.text.y=plot.axis.text.y,
plot_size_ratio=plot_size_ratio)
p.busco.genome.eukaryota <- plot_BUSCO_percentages(
data=busco.genome.eukaryota,
plot_title="BUSCO Eukaryota - Genome",
plot.axis.text.y=plot.axis.text.y,
plot.legend=plot.legend,
plot_size_ratio=plot_size_ratio)
p.busco.genome.metazoa <- plot_BUSCO_percentages(
data=busco.genome.metazoa,
plot_title="BUSCO Metazoa - Genome",
plot.axis.text.y=plot.axis.text.y,
plot.legend=plot.legend,
plot_size_ratio=plot_size_ratio)
p.busco.protein.eukaryota <- plot_BUSCO_percentages(
data=busco.protein.eukaryota,
plot_title="BUSCO Eukaryota - Protein",
plot.axis.text.y=plot.axis.text.y,
plot.legend=plot.legend,
plot_size_ratio=plot_size_ratio)
p.busco.protein.metazoa <- plot_BUSCO_percentages(
data=busco.protein.metazoa,
plot_title="BUSCO Metazoa - Protein",
plot.axis.text.y=plot.axis.text.y,
plot.legend=plot.legend,
plot_size_ratio=plot_size_ratio)
Plot final combined panel plot.
samples$name <- apply(samples, 1, format_sp_name )
p <- ggtree(species.tree) %<+% samples +
geom_highlight(
data = clade.colors,
mapping = aes(
node=node,
fill=I(fill),
alpha=I(alpha),
),
extendto = 1.400, # Change position of colored boxes
to.bottom=TRUE,
) +
geom_cladelab(
data = clade.colors,
mapping = aes(
node=node,
label=n,
),
align = TRUE,
offset.text = 0.0,
hjust = "center",
fontsize = 2,
offset = 0.2, # Push everything to the right
barsize = 0,
fontface="bold",
) +
geom_tiplab(aes(label=name), size=6*plot_size_ratio, linesize=0.2, parse=TRUE, align=TRUE, offset=0.01) +
geom_tippoint(aes(color=data_type), size=6*plot_size_ratio, show.legend=FALSE) +
hexpand(.1)
p.gene.stats.count %>%
insert_left(p, width=6) %>%
insert_right(p.genome.stats.scafs) %>%
insert_right(p.genome.stats.bases) %>%
insert_right(p.genome.stats.N50) %>%
insert_right(p.busco.genome.eukaryota) %>%
insert_right(p.busco.genome.metazoa) %>%
insert_right(p.busco.protein.eukaryota) %>%
insert_right(p.busco.protein.metazoa)